library(readr)
library(ggplot2)
news <- read_csv("newsdiet.csv")
all <- read_csv("alldiet.csv")
NEWS CLUSTERING
newscols <- 4:96
news[,newscols] <- log(news[,newscols]+1) # columns have huge right tails
news[,newscols] <- apply(news[,newscols],2,scale)
newscols <- c(4:37,39:96) # drop EAmericanNow since entire column is zeroes
drop <- c(7,10,18,21,35,37,39,44,51,77,90,91,96)
drop <- c(drop,57,58,72,78)
num.clusters <- 2:20
F <- double(length(num.clusters))
Rsqr <- double(length(num.clusters))
sse <- double(length(num.clusters))
for (i in num.clusters) {
temp <- kmeans(news[,-c(1:3,38,97:106,drop)], centers = i)
summary(temp)
F[i-num.clusters[1]+1] <- invisible(summary(temp)$F)
Rsqr[i-num.clusters[1]+1] <- invisible(summary(temp)$Rsqr)
sse[i-num.clusters[1]+1] <- sum(temp$withinss)
}
plot(num.clusters,F, main="Pseudo F")
lines(num.clusters,F)

plot(num.clusters,Rsqr, main="R-squared")
lines(num.clusters,Rsqr)

plot(num.clusters,sse,main="Screeplot SSE vs. # Clusters")
lines(num.clusters,sse)

news.kmeans <- kmeans(news[,-c(1:3,38,97:106,drop)], centers = 10, iter.max = 50, nstart = 10)
news.summary <- summary(news.kmeans)
news.summary[1:4]
## $sse
## [1] 16966792
##
## $ssb
## [1] 9342158
##
## $Rsqr
## [1] 0.3550943
##
## $F
## [1] 21460.31
plot(news.kmeans)

news.cluster.names <- c("Avoiders", "Finance", "CBS/ABC/Ent", "Mid Fox", "MSNBC/CNN", "Sports", "High Fox", "Low All", "CNN", "Hounds")
NON-NEWS CLUSTERING
networks <- read_csv("networks2.csv",col_names=FALSE)
allcols <- c(4:204)
all[,allcols] <- log(all[,allcols]+1) # columns have huge right tails
all[,allcols] <- apply(all[,allcols],2,scale)
dropnews <- c(9:10,20:32,38,139:204) # drop news shows & aggregated genres
drop2 <- c(4,37,39:41,46,57,64,67,71,77:78,97:98,100,112,114:116,124,126,129,138) # dropped from PCA/alpha
all.reorder <- all[,-c(1:3,dropnews,drop2)]
all.reorder <- all.reorder[networks$X1]
names(all.reorder) <- paste(networks$X2, names(all.reorder), sep=" ")
all.reorder <- cbind(all[,1:3],all.reorder)
num.clusters <- 2:20
F.all <- double(length(num.clusters))
Rsqr.all <- double(length(num.clusters))
sse.all <- double(length(num.clusters))
for (i in num.clusters) {
temp <- kmeans(all.reorder[,-c(1:3)], centers = i)
summary(temp)
F.all[i-num.clusters[1]+1] <- invisible(summary(temp)$F)
Rsqr.all[i-num.clusters[1]+1] <- invisible(summary(temp)$Rsqr)
sse.all[i-num.clusters[1]+1] <- sum(temp$withinss)
}
plot(num.clusters,F.all, main="Pseudo F")
lines(num.clusters,F.all)

plot(num.clusters,Rsqr.all, main="R-squared")
lines(num.clusters,Rsqr.all)

plot(num.clusters,sse.all,main="Screeplot SSE vs. # Clusters")
lines(num.clusters,sse.all)

all.kmeans <- kmeans(all.reorder[,-c(1:3)], centers = 10, iter.max = 50, nstart = 10) #
all.summary <- summary(all.kmeans)
all.summary[1:4]
## $sse
## [1] 24910516
##
## $ssb
## [1] 8764940
##
## $Rsqr
## [1] 0.2602768
##
## $F
## [1] 13713.71
plot(all.kmeans) #

all.cluster.names <- c("Westerns/Old Shows", "Sitcoms", "Animated", "Talk Shows/Daytime Shows", "Avoiders", "HGTV/Talk Shows", "Above Avg All Shows", "Disney/CN (Kids)", "Below Avg All Shows", "HGTV Only")
CLUSTER CROSSTAB
news.clusters <- cbind(household_id=news$household_id,news.clusters=news.kmeans$cluster)
all.clusters <- cbind(household_id=all$household_id,all.clusters=all.kmeans$cluster)
clusters <- merge(all.clusters,news.clusters)
count.per.cluster <- table(clusters[,2:3])
pct.per.cluster <- round(count.per.cluster/nrow(clusters)*100,3)
pct.per.col <- apply(count.per.cluster,2,function(x) round(x/sum(x)*100,3))
count.per.cluster
## news.clusters
## all.clusters 1 2 3 4 5 6 7 8 9 10
## 1 5662 666 2005 2745 860 2282 3100 6730 2366 1456
## 2 10782 957 1557 3288 1023 7118 2240 6749 2825 1620
## 3 5636 753 3483 2410 700 5258 1353 8181 2916 1214
## 4 2034 1015 8633 1651 1517 2153 2408 8197 4004 2075
## 5 31653 866 110 4389 945 3979 3478 2754 1638 666
## 6 2504 1567 6475 2420 1322 2705 3004 7914 3619 2407
## 7 2957 745 4669 1564 982 1788 1290 4767 3048 2800
## 8 15016 346 428 2138 425 5160 1075 5208 1450 349
## 9 6770 1648 2786 3877 1771 5409 4497 11884 3763 1547
## 10 8132 1240 1006 4170 1066 4198 4078 7130 2386 1187
pct.per.cluster
## news.clusters
## all.clusters 1 2 3 4 5 6 7 8 9 10
## 1 1.614 0.190 0.572 0.783 0.245 0.651 0.884 1.919 0.674 0.415
## 2 3.074 0.273 0.444 0.937 0.292 2.029 0.639 1.924 0.805 0.462
## 3 1.607 0.215 0.993 0.687 0.200 1.499 0.386 2.332 0.831 0.346
## 4 0.580 0.289 2.461 0.471 0.432 0.614 0.686 2.337 1.141 0.592
## 5 9.023 0.247 0.031 1.251 0.269 1.134 0.991 0.785 0.467 0.190
## 6 0.714 0.447 1.846 0.690 0.377 0.771 0.856 2.256 1.032 0.686
## 7 0.843 0.212 1.331 0.446 0.280 0.510 0.368 1.359 0.869 0.798
## 8 4.281 0.099 0.122 0.609 0.121 1.471 0.306 1.485 0.413 0.099
## 9 1.930 0.470 0.794 1.105 0.505 1.542 1.282 3.388 1.073 0.441
## 10 2.318 0.353 0.287 1.189 0.304 1.197 1.163 2.033 0.680 0.338
CHI-SQUARE TEST
chisq.test(count.per.cluster)$stdres
## news.clusters
## all.clusters 1 2 3 4 5 6 7 8 9
## 1 -22.4934375 -4.2765928 -10.3193188 10.6780573 0.6158863 -17.6716554 23.4395881 18.8992268 3.2253665
## 2 10.7209894 -3.5987291 -34.9179459 3.3897620 -4.1563242 47.0837715 -13.2338876 -11.0571145 -4.4508726
## 3 -35.5324177 -4.9372278 13.4119350 -4.2000030 -9.0877100 29.8281154 -23.5271471 27.3804204 7.9722529
## 4 -87.7979820 2.5587067 113.6454475 -23.0271529 16.6622366 -30.5086237 -3.0145320 21.8713504 27.7696856
## 5 203.3375743 -15.8963274 -73.9459810 4.6720902 -16.3440552 -26.9878096 -6.1618149 -87.4800131 -42.4708444
## 6 -82.2332145 21.4372480 69.4955417 -7.3397679 9.8522230 -21.0067969 9.4632169 17.0344090 19.1456158
## 7 -51.8165431 2.2964314 57.7130382 -10.7681947 9.1694350 -21.2391834 -14.2717401 -1.8219115 26.3993592
## 8 91.5384503 -19.2138201 -49.2992233 -9.5323110 -18.2753808 28.7959349 -29.3110145 -15.5804053 -23.3507037
## 9 -54.0800814 12.9881133 -20.0301980 5.3451931 13.1466400 6.2692045 22.6440002 40.6124932 4.7570942
## 10 -11.0589807 9.3895530 -41.1309330 27.8003017 0.6478032 4.4242665 31.3261380 3.9047078 -7.8697092
## news.clusters
## all.clusters 10
## 1 7.2903702
## 2 -1.2373737
## 3 -5.1556355
## 4 16.9269144
## 5 -36.2168584
## 6 25.8443264
## 7 55.8003878
## 8 -29.7507567
## 9 -9.2994629
## 10 -8.9746885
df.heatmap <- expand.grid(all.clusters=all.cluster.names, news.clusters=news.cluster.names)
m <- as.matrix(pct.per.cluster)
q <- c()
for (i in 1:ncol(m)) q <- c(q,m[,i])
df.heatmap$proportion <- as.numeric(q)
df.heatmap$labels <- paste(df.heatmap$proportion,"%", sep="")
Heatmap of overall proportions (sum of all tiles is 100%)
ggplot(data = df.heatmap, aes(x = news.clusters, y = all.clusters)) +
geom_tile(aes(fill = proportion)) +
scale_fill_continuous(low = "white", high = "blue", limits=c(0,10)) +
scale_x_discrete(position = "top") +
geom_text(aes(label = labels))

Heatmap of within-news-clusters proportions (each column sums to 100%)
df.heatmap2 <- expand.grid(all.clusters=all.cluster.names, news.clusters=news.cluster.names)
m <- as.matrix(pct.per.col)
q <- c()
for (i in 1:ncol(m)) q <- c(q,m[,i])
df.heatmap2$proportion <- as.numeric(q)
df.heatmap2$labels <- paste(df.heatmap2$proportion,"%", sep="")
ggplot(data = df.heatmap2, aes(x = news.clusters, y = all.clusters)) +
geom_tile(aes(fill = proportion)) +
scale_fill_continuous(low = "white", high = "blue", limits=c(0,35)) +
scale_x_discrete(position = "top") +
geom_text(aes(label = labels)) +
theme(axis.text=element_text(size=12, face="bold"))
